Clustering ASVs

Code
library(dplyr) ; library(tidyr) ; library(ggplot2) 
library(doParallel) ; library(foreach) ; library(doSNOW)

# Locate the project root via the .git index so every path below is
# independent of the current working directory.
root <- rprojroot::has_file(".git/index")
datadir <- root$find_file("data")
funsdir <- root$find_file("functions")
savingdir <- root$find_file("saved_files")
savingdirOceanSlices <- root$find_file("saved_files_oceanSlices")

datapath <- root$find_file(file.path(datadir, "grump.phaeocystis_asv_long.csv"))
files_vec <- list.files(funsdir)
currentwd <- getwd()  # NOTE(review): appears unused in this document — confirm before removing

# Source every helper shipped in functions/ (safe when the folder is empty,
# unlike the 1:length() form).
for (fn in files_vec) {
  source(root$find_file(file.path(funsdir, fn)))
}

# Drop the MOSAiC cruise and zero-count rows before tidying.
dframe <- data.table::fread(input = datapath) %>%
  filter(Cruise != "MOSAiC", Raw.Sequence.Counts > 0)
dframe_allASVs <- tidy_grump(Dframe = dframe)

The idea is to create a ‘custom’ distance matrix to induce the groupings.

  • \(c_1\) is within the same species - but no unassigned
  • \(c_2\) is between assigned species
  • \(c_3\) is between species and unassigned
  • \(c_4\) is within unassigned and unassigned

For this document we will use only the following configuration:

\(c_1=1\), \(c_2=1000\), \(c_3=10\) , \(c_4=10\)

A priori, the ‘probability’ of unassigned ASVs agglomerating among themselves is the same as that of agglomerating with other species. A large \(c_2\) makes it harder for ASVs assigned to different species to group together.

Code
# Custom distance matrix inducing the desired grouping; the four costs are
# c1 = within assigned species, c2 = between assigned species,
# c3 = species vs unassigned, c4 = within unassigned (see prose above).
inducedDist_ <- inducedDist(
  dFrame = dframe_allASVs$dframe,
  c1 = 1, c2 = 1000, c3 = 10, c4 = 10,
  compMatrix = dframe_allASVs$ASV_composition)

If we exclude the ASVs that appear only one ~or two~ times, does it help?

Code
nASVs <- dframe_allASVs$dframe %>% distinct(ASV_name) %>% nrow()

# Per ASV, count the number of distinct samples it was observed in, then
# bin those counts and summarise how many ASVs fall in each bin.
# NOTE(review): the original summed per-(ASV, sample) row counts, which
# equals the number of samples only when there is one row per
# (ASV, sample) pair — n_distinct() states the intended metric directly.
plot0s_summ <- dframe_allASVs$dframe %>%
  group_by(ASV_name) %>%
  summarise(nSamplesObserved = n_distinct(SampleKey)) %>%
  arrange(nSamplesObserved) %>%
  mutate(FreqNSamples = cut(nSamplesObserved,
                            breaks = c(0, 1, 2, 3, 5, 10, 25, 50, 100, 250, 1500),
                            right = TRUE, include.lowest = TRUE)) %>%
  group_by(FreqNSamples) %>%
  summarise(Freq = n(), Pct = n() / nASVs) %>%
  mutate(CummPct = cumsum(Pct))


# Number of ASVs per sample-frequency bin.
plot0s_summ %>% ggplot(aes(x = FreqNSamples, y = Freq)) + geom_bar(stat = 'identity')

Code
plot0s_summ %>% ggplot(aes(x=FreqNSamples,y=CummPct))+geom_bar(stat='identity')

Code
plot0s_summ %>% knitr::kable() %>% kableExtra::kable_styling()
FreqNSamples Freq Pct CummPct
[0,1] 715 0.4824561 0.4824561
(1,2] 169 0.1140351 0.5964912
(2,3] 80 0.0539811 0.6504723
(3,5] 103 0.0695007 0.7199730
(5,10] 121 0.0816464 0.8016194
(10,25] 129 0.0870445 0.8886640
(25,50] 67 0.0452092 0.9338731
(50,100] 48 0.0323887 0.9662618
(100,250] 41 0.0276653 0.9939271
(250,1.5e+03] 9 0.0060729 1.0000000
Code
### Creating vectors with ids of ASVs to remove:
# Per ASV, count the number of samples it appears in (same logic as
# the plot0s_summ chunk above, before binning).
ASVs2Remove_df <- dframe_allASVs$dframe %>%
  group_by(ASV_name, SampleKey) %>%
  summarise(Freq = n()) %>%
  mutate(nSamplesObserved = sum(Freq)) %>%
  arrange(nSamplesObserved) %>%
  select(ASV_name, nSamplesObserved) %>%
  distinct()

# "2orMore" = ASVs to KEEP appear in >= 2 samples, so we collect those
# seen in fewer than 2 (resp. 3) samples for removal.
vetASVs_2orMore <- ASVs2Remove_df %>% filter(nSamplesObserved < 2) %>% pull(ASV_name)
vetASVs_3orMore <- ASVs2Remove_df %>% filter(nSamplesObserved < 3) %>% pull(ASV_name)

saveRDS(vetASVs_2orMore, file = file.path(savingdirOceanSlices, "asvs_to_remove_oneSample"))
saveRDS(vetASVs_3orMore, file = file.path(savingdirOceanSlices, "asvs_to_remove_twoSample"))

This is the number of ASVs that are present in exactly that number of samples.

Let’s now look at how many ASVs each sample contains,

Code
## now looking at ASVs inside samples
# NOTE(review): the denominator originally counted distinct SampleID while
# the grouping below uses SampleKey; count SampleKey so Pct is the
# fraction of the grouped units (the published table's CummPct reaches 1).
nsamples <- dframe_allASVs$dframe %>% distinct(SampleKey) %>% nrow()

# Per sample, count its ASVs; then bin the counts and tabulate.
plot0s_summ2 <- dframe_allASVs$dframe %>%
  group_by(SampleKey) %>%
  summarise(NumbersOfAsvInEachSample = n()) %>%
  arrange(NumbersOfAsvInEachSample) %>%
  mutate(FreqAsvs = cut(NumbersOfAsvInEachSample,
                        breaks = c(0, 1, 2, 3, 5, 10, 25, 50, 100, 250, 500),
                        right = TRUE, include.lowest = TRUE)) %>%
  group_by(FreqAsvs) %>%
  summarise(Freq = n(), Pct = n() / nsamples) %>%
  mutate(CummPct = cumsum(Pct))

plot0s_summ2 %>% knitr::kable() %>% kableExtra::kable_styling()
FreqAsvs Freq Pct CummPct
[0,1] 90 0.0911854 0.0911854
(1,2] 58 0.0587639 0.1499493
(2,3] 36 0.0364742 0.1864235
(3,5] 62 0.0628166 0.2492401
(5,10] 161 0.1631206 0.4123607
(10,25] 243 0.2462006 0.6585613
(25,50] 253 0.2563323 0.9148936
(50,100] 82 0.0830800 0.9979737
(100,250] 2 0.0020263 1.0000000

Let’s now compare the distance matrix using MDS, t-SNE, and UMAP.

Code
# Dimension-reduction views (Aitchison distance) of the full ASV composition.
dim_reduce_obj <- dim_reduce_plots(dfComposition = dframe_allASVs$ASV_composition)

dim_reduce_obj$MDS2d_ASVs_Ait  # 2-d MDS plot

Code
dim_reduce_obj$TSNE_ASVs_Ait_2d

Code
dim_reduce_obj$umap_ASVs_Ait_2d

Code
# Re-tidy the data, excluding ASVs observed in only one sample.
dframe2plus <- tidy_grump(Dframe = dframe, vet_ASVs2remove = vetASVs_2orMore)
dim(dframe2plus$ASV_composition)
[1] 767 975

We have 767 ASVs and 975 samples

Code
# Dimension reduction after removing single-sample ASVs; larger t-SNE
# perplexity than the default run above.
dim_reduce_obj2plus <- dim_reduce_plots(
  dfComposition = dframe2plus$ASV_composition,perplexityTsne = 100)

dim_reduce_obj2plus$MDS2d_ASVs_Ait

Code
dim_reduce_obj2plus$TSNE_ASVs_Ait_2d

Code
dim_reduce_obj2plus$umap_ASVs_Ait_2d

Code
# Re-tidy the data, excluding ASVs observed in fewer than three samples.
dframe3plus <- tidy_grump(Dframe = dframe, vet_ASVs2remove = vetASVs_3orMore)

dim(dframe3plus$ASV_composition)
[1] 598 971

598 ASVs and 971 samples.

Code
# Dimension reduction after removing ASVs seen in < 3 samples.
dim_reduce_obj3plus <- dim_reduce_plots(dfComposition = dframe3plus$ASV_composition,perplexityTsne = 100)
dim_reduce_obj3plus$MDS2d_ASVs_Ait

Code
dim_reduce_obj3plus$TSNE_ASVs_Ait_2d

Code
dim_reduce_obj3plus$umap_ASVs_Ait_2d

It seems that removing rare ASVs won’t help.

Here the idea is to aggregate some samples. We would still have compositions of ASVs, but over aggregated units instead of individual samples, giving a less sparse matrix with fewer dimensions. The most natural aggregations could be:

  1. Compositions of ASVs, under longhurst provinces
  2. Compositions of ASVs, under longhurst provinces combined with different depths
  3. Cruises?
  4. Chunks of latitudes, and chunks of depths.

So for each aggregation we should take a look at mds, tsne, umap of the ait distances (or distance matrix using the CLR transformation)

So first lets do a bit of EDA to see what we can expect.

Code
# Distribution of sample depths (one point per distinct sample).
dframe_allASVs$dframe %>% select(SampleKey,Depth,Longhurst_Short) %>%  distinct() %>% 
  ggplot(aes(Depth))+geom_histogram()

Code
# Same depth distribution summarised as a boxplot.
dframe_allASVs$dframe %>% select(SampleKey,Depth,Longhurst_Short) %>%  distinct() %>% 
  ggplot(aes(Depth))+geom_boxplot()

Let’s slice the depths of the ocean into 10% quantiles.

Code
# Decile breakpoints of sample depth; label the ten resulting bins
# DGR01..DGR10 (depth group rank). sprintf("%02d") zero-pads the single
# digits, replacing the original ifelse/paste0 branching.
distinct_depth <- dframe_allASVs$dframe %>% select(SampleKey, Depth) %>% distinct()
qtls_0.1 <- quantile(distinct_depth$Depth, seq(0, 1, 0.1))
qtlsNames <- sprintf("DGR%02d", seq_len(length(qtls_0.1) - 1))
qtls_0.1
        0%        10%        20%        30%        40%        50%        60% 
   0.00000    1.18800   15.00000   28.80741   45.00000   61.50000   81.29718 
       70%        80%        90%       100% 
 105.47600  150.64307  335.50200 2568.80000 

Here are the depth quantiles.

Code
# Assign each sample's depth to its decile category and tabulate the
# bin sizes (roughly equal by construction of the quantile breaks).
distinct_depth <- distinct_depth %>%
  mutate(CatDepth = cut(Depth, breaks = qtls_0.1, include.lowest = TRUE,
                        right = TRUE, labels = qtlsNames))

distinct_depth %>%
  group_by(CatDepth) %>%
  summarise(Freq = n())
# A tibble: 10 × 2
   CatDepth  Freq
   <fct>    <int>
 1 DGR01       99
 2 DGR02      115
 3 DGR03       82
 4 DGR04      112
 5 DGR05       86
 6 DGR06       98
 7 DGR07       99
 8 DGR08       98
 9 DGR09       99
10 DGR10       99
Code
# Number of samples per depth decile (CatDepth), faceted by Longhurst
# province; free y-scales since provinces differ widely in sample count.
dframe_allASVs$dframe %>%
  select(SampleKey, Depth, Longhurst_Short) %>%
  distinct() %>%
  mutate(CatDepth = cut(Depth, breaks = qtls_0.1, include.lowest = TRUE,
                        right = TRUE, labels = qtlsNames)) %>%
  ggplot(aes(CatDepth)) +
  geom_bar() +
  facet_wrap(~Longhurst_Short, scales = 'free_y')

This is the distribution of the number of samples for each DGR (depth group rank) within each Longhurst province.

So our first attempt will be to create compositions of ASVs with this set of combinations (DGR × LH).

Code
# Load the precomputed Longhurst x depth-decile composition and run the
# dimension reductions on it (perplexity/neighbours raised for the
# aggregated, denser matrix).
LH_Depth_Composition_df <- readRDS(file = file.path(savingdir, "LH_Depth_Composition_df"))

LH_Depth_dimRed <- dim_reduce_plots(
  dfComposition = LH_Depth_Composition_df, perplexityTsne = 100,
  neighUmap = 250)

LH_Depth_dimRed$MDS2d_ASVs_Ait

Code
LH_Depth_dimRed$TSNE_ASVs_Ait_2d

Code
LH_Depth_dimRed$umap_ASVs_Ait_2d

Code
# Same pipeline for the latitude x depth aggregation.
Lat_Depth_Composition_df <- readRDS(file = file.path(savingdir, "Lat_Depth_Composition_df"))

Lat_Depth_dimRed <- dim_reduce_plots(
  dfComposition = Lat_Depth_Composition_df, perplexityTsne = 100,
  neighUmap = 250)

Lat_Depth_dimRed$MDS2d_ASVs_Ait

Code
Lat_Depth_dimRed$TSNE_ASVs_Ait_2d

Code
Lat_Depth_dimRed$umap_ASVs_Ait_2d

#Creating Clusters

Let’s now create and evaluate the clusters using this two ocean slices that we have